** This do-file builds shocks in household space **

cd "D:\Dropbox\unequal_gains\QJE revision plan\analysis\"
global Section4 "D:\Dropbox\unequal_gains\QJE revision plan\analysis\section4_data"

*** 1. Build baseline shocks: Age by education by race by state by children

* Load the IPUMS extract (IPUMS is freely available online)
use ipums_feb2018, clear

* Sample restriction 1: household heads only (related==101)
drop if related!=101

* Sample restriction 2: the two pooled windows, 2000-2004 and 2012-2016
drop if !(inrange(year,2000,2004) | inrange(year,2012,2016))

* period = 1 for the early window, 2 for the late window.
* Every remaining row falls in exactly one of the two windows.
gen period = cond(inrange(year,2000,2004), 1, 2)

* Education category from the detailed IPUMS code educd:
*   1 = below high school      (educd < 62)
*   2 = high school graduate   (62 <= educd < 65)
*   3 = some college           (65 <= educd < 101)
*   4 = college graduate       (101 <= educd <= 113)
*   5 = post college graduate  (educd > 113)
* NOTE(review): every code above 113 lands in category 5; if the extract
* carries a missing-data sentinel above 113, it would be classified as
* post-college -- confirm against the IPUMS codebook.
drop if missing(educd)
gen education = cond(educd<62, 1, ///
                cond(educd<65, 2, ///
                cond(educd<101, 3, ///
                cond(educd>113, 5, 4))))

* Three-way race variable built from the original coding, which is kept
* under the name race_old:
*   1 = White (race_old==1), 2 = Black (race_old==2), 3 = Other (race_old>2)
* Any other value of race_old is left missing.
drop if missing(race)
rename race race_old
gen race = cond(race_old==1, 1, ///
           cond(race_old==2, 2, ///
           cond(race_old>2, 3, .)))

* Presence of children: 1 if nchild>0, 0 if nchild==0
* (any other value of nchild is left missing)
drop if missing(nchild)
gen children = cond(nchild>0, 1, cond(nchild==0, 0, .))

* Age bins for heads aged 20 and over:
*   bin 1 = under 26, bins 2-11 = five-year bands with upper cutoffs
*   31, 36, ..., 76 (exclusive), bin 12 = everyone older.
drop if missing(age) | age<20
gen age_bin=.
local bin = 0
forvalues cutoff = 26(5)76 {
    local ++bin
    * assign the first (lowest) bin whose upper cutoff exceeds age
    replace age_bin = `bin' if age<`cutoff' & missing(age_bin)
}
* anything still unassigned is at or above the last cutoff
replace age_bin = 12 if missing(age_bin)

* Collapse to the summed household weight of each
* period x age bin x education x state x race x children cell
collapse (sum) hhwt, by(period age_bin education statefip race children)

* Document household growth rates.
* Each demographic/state cell is a panel unit indexed by period (1, 2), so
* the lag operator L. on a period-2 row returns that cell's period-1 value.
* Variable names are spelled out in full so the script does not depend on
* Stata's variable-abbreviation setting (set varabbrev).
egen hh_group=group(age_bin education statefip race children)
tsset hh_group period
gen double P=hhwt
gen double P_initial=L.hhwt
gen double log_P=log(hhwt)
gen double log_P_initial=L.log_P
* log growth of the summed household weight between the two periods
gen g_n= log_P - log_P_initial

* keep only rows where the growth rate is defined, i.e. period-2 rows of
* cells that also have a (positive) period-1 total
drop if missing(g_n)
keep g_n age_bin education hh_group log_P log_P_initial statefip race children P P_initial

* transform into annual shocks: divide by the 12 years separating the
* two windows (2016-2004)
gen g_n_annual = g_n/(2016-2004)

save "$Section4/age_educ_race_children_state_groups", replace

*** 2. Residualize with linear age trends & various f.e.
*** after restricting to age-educ-race-children-state groups observed in Nielsen

use "$Section4/age_educ_race_children_state_groups", clear

* Restrict to the groups listed in the "_used" file.
* Keys are spelled out in full (age_bin, not the abbreviation age) so the
* merge does not depend on Stata's variable-abbreviation setting.
merge 1:1 age_bin education statefip race children using "$Section4/age_educ_race_children_state_groups_used"
* we lose quite a few groups (which exist in IPUMS but are not observed in Nielsen data)
keep if _merge==3
drop _merge

* Weight clause shared by every weighted summary and regression below:
* analytic weights equal to the log of the average of final and initial P
local wgt "[aw=log((P+P_initial)/2)]"

sum g_n_annual, d
sum g_n_annual `wgt', d
* weighted mean of the raw shock; added back to every residual so each
* residualized series is re-centred on the same level
gen mean_g_n_annual=r(mean)

* Specification table: "<suffix> <controls>".
* Each row produces g_n_annual_<suffix> = residual + weighted mean.
* Row order is the order in which the output variables are created, and
* therefore their column order in the saved file -- do not reorder.
* NOTE(review): rows 7 and 11 originally wrote the control as "age", which
* resolves to age_bin by abbreviation when no other variable starts with
* "age"; if the "_used" file supplies its own variable named age, confirm
* which one was intended.
local spec1  "linageres        age_bin"
local spec2  "ageres           i.age_bin"
local spec3  "ageeducres       i.age_bin i.education"
local spec4  "ageeducstateres  i.age_bin i.education i.statefip"
local spec5  "allres           i.age_bin i.education i.statefip i.race i.children"
local spec6  "educres          i.education"
local spec7  "linageducres     age_bin i.education"
local spec8  "stateres         i.statefip"
local spec9  "raceres          i.race"
local spec10 "childrenres      i.children"
local spec11 "linagestateres   age_bin i.statefip"
local spec12 "linageraceres    age_bin i.race"
local spec13 "linagechildres   age_bin i.children"

forvalues s = 1/13 {
    * first token is the output suffix, the remainder is the control list
    gettoken suffix controls : spec`s'

    regress g_n_annual `controls' `wgt', r
    predict g_n_annual_hat
    gen g_n_annual_`suffix'=g_n_annual-g_n_annual_hat + mean_g_n_annual
    sum g_n_annual_`suffix' `wgt', d
    drop g_n_annual_hat
}

order g_n_annual*
save "$Section4/age_educ_race_children_state_groups_final", replace
